#### Wordfish Model Averaging ####

# Set your working directory to Data subdirectory of the replication repo. For
# us, this looks like:
setwd("~/Desktop/Replication_Materials")

# Install preText:
install.packages("preText")

# Load the packages:
library(preText)
library(ggplot2)
library(reshape2)
library(grid)
library(gridExtra)
library(cowplot)

# Load the data:
load("./Data/UK_Manifestos.RData")

# load in labels for plotting:
load("./Data/128_Combination_Preprocessing_Labels.RData")

# Preprocess the data (note we use a higher threshold here since there are only
# 69 documents and we want to exclude terms that do not appear in at least two
# of them):
uk_manifestos_fact_prep <- factorial_preprocessing(documents,
                                                   infrequent_term_threshold = 0.02)


# fix document names by removing .txt ending
remove_txt <- function(str){
    stringr::str_split(str,"\\.")[[1]][1]
}
for(i in 1:128){
    uk_manifestos_fact_prep$dfm_list[[i]]@Dimnames$docs <- as.character(
        sapply(uk_manifestos_fact_prep$dfm_list[[i]]@Dimnames$docs,remove_txt))
}

# Get the year each document was written and store them as a numeric vector:
dfm <- uk_manifestos_fact_prep$dfm_list[[1]]
rl <- function(str) {
    stringr::str_replace_all(str,"[A-Za-z\\.]+","")
}
years <- as.numeric(sapply(rownames(dfm),rl))

# use the wordfish_comparison function to compare all dfms. We are using
# conservative and labour manifestos from 1983, 1987, 1992, and 1997 for a total
# of 8 manifestos. These are indicated by the document_inidices = c(42:45,19:22)
# argument. You can see the document names by entering rownames(dfm) into the
# console. We need to set the anchors to 1,5 because anchoring is applied in the
# reduced dfm. We are also only including terms that appear at least once in a
# manifesto from each of the 4 years, to deal with the strong temporal effects.
wordfish_results <- wordfish_comparison(
    uk_manifestos_fact_prep$dfm_list,
    years,
    anchors = c(1,5),
    proportion_threshold = 1,
    document_inidices = c(42:45,19:22))


# function to generate coefficient plots:
generate_plot <- function(data,
                          text_size = 1,
                          remove_intercept = FALSE,
                          yname = NULL) {
    #define colors
    Model <- Variable <- Coefficient <- SE  <- NULL

    # if we are only using the one model, proceed as normal.

    if (remove_intercept) {
        data <- data[-which(data$Variable == "Intercept"),]
    }

    # Dark2
    zp1 <- ggplot2::ggplot(data, ggplot2::aes(colour = Model)) +
        ggplot2::scale_colour_brewer(palette = "Set1") +
        ggplot2::theme(axis.text = ggplot2::element_text(size = text_size))
    zp1 <- zp1 + ggplot2::geom_vline(xintercept = 0,
                                     colour = gray(1/2),
                                     lty = 2)
    zp1 <- zp1 + ggplot2::geom_segment(
        ggplot2::aes(y = Variable,
                     yend = Variable,
                     x = Coefficient - SE*(-qnorm((1 - 0.95)/2)),
                     xend = Coefficient + SE*(-qnorm((1 - 0.95)/2))),
        lwd = 1)
    zp1 <- zp1 + ggplot2::geom_point(
        ggplot2::aes(y = Variable,
                     x = Coefficient),
        lwd = 2,
        shape = 21, fill = "WHITE")

    zp1 <- zp1  + ggplot2::theme_bw() +
        facet_grid(~ Model, scales = "free_x" ) +
        ggplot2::theme(legend.position = "none",
                       axis.text.x=ggplot2::element_text(angle=-45, hjust=0),
                       axis.title.y=ggplot2::element_text(angle=-90, size=20, face="bold")) +
        ggplot2::ylab(yname) +
        ggplot2::xlab("Wordfish Score")

    print(zp1)
}

# Function to calculate averaged wordfish results:
calculate_averaged_thetas_SEs <- function(thetas,
                                          ses,
                                          weights = NULL) {

    if (is.null(weights)) {
        weights <- rep(1/length(thetas),length(thetas))
    }

    # take average
    ave_theta <- sum(thetas * weights)

    ave_SE <- 0
    for (i in 1:length(ses)) {
        ave_SE <- ave_SE + (weights[i] * (ses[i]^2 + (thetas[i] - ave_theta)^2))
    }
    ave_SE <- sqrt(ave_SE)

    return(c(ave_theta,ave_SE))
}

# Our a priori rankings of documents:
ranking = c("Lab1983","Lab1987","Lab1992","Lab1997",
            "Con1992", "Con1997","Con1987","Con1983")

# get the indices for the reduced choice set of steps for which we do not have
# strong theory and for which the pretext regression results indicate we need
# to check:
choices <- uk_manifestos_fact_prep$choices
inds <- which(!choices$use_ngrams & choices$infrequent_terms &
                  choices$lowercase & choices$removeNumbers)

# Reduce down the data frame:
choices <- choices[inds,]

# Get a list of wordfish results for these 8 combinations of preprocessing steps:
score_list <- wordfish_results$results_by_dfm[inds]

scores <- matrix(0,nrow= 8, ncol = 8)
ses <- matrix(0,nrow= 8, ncol = 8)
# now reorder estimates based on theoretical expectation and put them in a data
# frame:
for (i in 1:8) {
    for (j in 1:8) {
        ind <- which(score_list[[i]]$document == ranking[j])
        scores[i,j] <- score_list[[i]]$score[ind]
        ses[i,j] <- score_list[[i]]$std_error[ind]
    }
}

# now get the average wordfish scores across all 8 preprocessing specifications:
averaged <- data.frame(Coefficient = rep(0,8),
                       SE = rep(0,8),
                       Variable = ranking,
                       Model = rep("Averaged", 8))
for (i in 1:8) {
    averaged[i,1:2] <-  calculate_averaged_thetas_SEs(scores[,i],
                                                      ses[,i])
}

# create a data frame for just the estimates from our theoretically selected
# preprocessing specification:
chosen <- data.frame(Coefficient = scores[1,],
                     SE = ses[1,],
                     Variable = ranking,
                     Model = rep("P-N-L-S-W-I", 8))

# Combine the theoretical and averaged results together:
data <- rbind(chosen,averaged)

# Make sure that we have the correct rankings for the documents:
data$Variable <- factor(data$Variable, levels = ranking)

# Generate the plot:
pdf(file = "Selected_vs_Averaged_Wordfish_Scores.pdf",
    width = 8,
    height = 2.5)
generate_plot(data)
dev.off()
